#!/usr/bin/python
# This script extracts the orthologous genes present in a given group of
# bacteria, grouped by orthologous group, for Muscle alignment. Needs 2 input files:
# 1: Tab-separated matrix file of orthologous genes in this format -
#CT0191  PG_1118 BT_4597 FP1765  BF1205  .... SGRA_2762
#CT1855  PG_0268 BT_3354 FP1775  BF0215  .... SGRA_4045
#CT1480  PG_2124 BT_4263 FP1613  BF0967  .... SGRA_3985
# 2: Sequence multi-fasta containing all the orthologous genes

import sys
from Bio import SeqIO

def parse_matrix_line(line):
    """Split one matrix line on tabs and strip whitespace from every field.

    Field positions are preserved (empty fields stay as empty strings) so
    that the first element is always the first column of the line.
    """
    return [field.strip() for field in line.split('\t')]


def group_file_name(first_id, index, out_sfx):
    """Build the output file name for one matrix line.

    The name is "<first_id>.<NNN>.<out_sfx>", where NNN is the 1-based
    line number zero-padded to three digits,
    e.g. "CT0191.001.46.28.indiv.fasta".
    """
    return first_id + "." + str(index + 1).zfill(3) + "." + out_sfx


def extract_ortholog_groups(matrix_file, seqs_file, out_sfx):
    """Write one FASTA file per matrix line with the sequences listed on it.

    matrix_file -- tab-separated file, one group of sequence IDs per line
    seqs_file   -- multi-FASTA file holding the sequences to pick from
    out_sfx     -- suffix appended (after a dot) to every output file name

    IDs that are not present in seqs_file are silently ignored; a line
    whose IDs are all unknown still produces an (empty) output file.
    """
    # Plain loop rather than SeqIO.to_dict: on duplicate IDs the last
    # record wins instead of raising.
    seqs_dict = {}
    for record in SeqIO.parse(seqs_file, "fasta"):
        seqs_dict[record.id] = record

    # Default text mode replaces the old "rU" flag, which Python 3.11
    # rejects; Python 3 text mode already translates all newline styles.
    with open(matrix_file) as matrix_handle:
        for index, line in enumerate(matrix_handle):
            ids = parse_matrix_line(line)
            if not any(ids):
                # Blank line: skip it, but keep counting it so the numbering
                # of the remaining groups still matches their line numbers.
                continue
            to_extract = [seqs_dict[seq_id] for seq_id in ids
                          if seq_id in seqs_dict]
            # Use the stripped first ID so the file name never contains a
            # trailing newline or stray spaces.
            out_name = group_file_name(ids[0], index, out_sfx)
            with open(out_name, "w") as out_handle:
                SeqIO.write(to_extract, out_handle, "fasta")


def main(argv):
    """Command-line entry point: argv is [prog, matrix, fasta, suffix]."""
    if len(argv) < 4:
        sys.exit("Usage: %s <matrix_file> <seqs_fasta> <out_suffix>" % argv[0])
    extract_ortholog_groups(argv[1], argv[2], argv[3])


if __name__ == "__main__":
    main(sys.argv)

